{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pyspark Basics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#create pyspark session\n",
    "from pyspark.sql import SparkSession\n",
    "spark=SparkSession.builder.appName('pyspark').getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [],
   "source": [
    "#read the data file\n",
    "df=spark.read.csv('conversion_data.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+---+--------+------+-------------------+---------+\n",
      "|    _c0|_c1|     _c2|   _c3|                _c4|      _c5|\n",
      "+-------+---+--------+------+-------------------+---------+\n",
      "|country|age|new_user|source|total_pages_visited|converted|\n",
      "|     UK| 25|       1|   Ads|                  1|        0|\n",
      "|     US| 23|       1|   Seo|                  5|        0|\n",
      "|     US| 28|       1|   Seo|                  4|        0|\n",
      "|  China| 39|       1|   Seo|                  5|        0|\n",
      "+-------+---+--------+------+-------------------+---------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [],
   "source": [
    "#read the data file\n",
    "df=spark.read.csv('conversion_data.csv',header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+---+--------+------+-------------------+---------+\n",
      "|country|age|new_user|source|total_pages_visited|converted|\n",
      "+-------+---+--------+------+-------------------+---------+\n",
      "|     UK| 25|       1|   Ads|                  1|        0|\n",
      "|     US| 23|       1|   Seo|                  5|        0|\n",
      "|     US| 28|       1|   Seo|                  4|        0|\n",
      "|  China| 39|       1|   Seo|                  5|        0|\n",
      "|     US| 30|       1|   Seo|                  6|        0|\n",
      "+-------+---+--------+------+-------------------+---------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- country: string (nullable = true)\n",
      " |-- age: string (nullable = true)\n",
      " |-- new_user: string (nullable = true)\n",
      " |-- source: string (nullable = true)\n",
      " |-- total_pages_visited: string (nullable = true)\n",
      " |-- converted: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+-------+------------------+-------------------+------+-------------------+-------------------+\n",
      "|summary|country|               age|           new_user|source|total_pages_visited|          converted|\n",
      "+-------+-------+------------------+-------------------+------+-------------------+-------------------+\n",
      "|  count| 316200|            316200|             316200|316200|             316200|             316200|\n",
      "|   mean|   null|30.569857685009488| 0.6854648956356736|  null|  4.872966476913346|0.03225806451612903|\n",
      "| stddev|   null| 8.271801801807728|0.46433119036384723|  null|  3.341103757948214|0.17668497535763514|\n",
      "|    min|  China|               111|                  0|   Ads|                  1|                  0|\n",
      "|    max|     US|                79|                  1|   Seo|                  9|                  1|\n",
      "+-------+-------+------------------+-------------------+------+-------------------+-------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#statistical summary for data numerical columns\n",
    "df.describe().show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Datatypes "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql.functions import col , column\n",
    "df = df.withColumn(\"age\", col(\"age\").cast(\"Int\"))\\\n",
    "    .withColumn(\"new_user\", col(\"new_user\").cast(\"Int\"))\\\n",
    "    .withColumn(\"total_pages_visited\", col(\"total_pages_visited\").cast(\"Int\"))\\\n",
    "    .withColumn(\"converted\", col(\"converted\").cast(\"Int\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- country: string (nullable = true)\n",
      " |-- age: integer (nullable = true)\n",
      " |-- new_user: integer (nullable = true)\n",
      " |-- source: string (nullable = true)\n",
      " |-- total_pages_visited: integer (nullable = true)\n",
      " |-- converted: integer (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+---+--------+------+-------------------+---------+\n",
      "|country|age|new_user|source|total_pages_visited|converted|\n",
      "+-------+---+--------+------+-------------------+---------+\n",
      "|     UK| 25|       1|   Ads|                  1|        0|\n",
      "|     US| 23|       1|   Seo|                  5|        0|\n",
      "|     US| 28|       1|   Seo|                  4|        0|\n",
      "|  China| 39|       1|   Seo|                  5|        0|\n",
      "|     US| 30|       1|   Seo|                  6|        0|\n",
      "+-------+---+--------+------+-------------------+---------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Column<b'country'>"
      ]
     },
     "execution_count": 133,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#acess dataframe column , we get column object \n",
    "df['country']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pyspark.sql.column.Column"
      ]
     },
     "execution_count": 134,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(df['country'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+\n",
      "|country|\n",
      "+-------+\n",
      "|     UK|\n",
      "|     US|\n",
      "|     US|\n",
      "|  China|\n",
      "|     US|\n",
      "+-------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#access content of colum\n",
    "df.select('country').show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------+\n",
      "|country|source|\n",
      "+-------+------+\n",
      "|     UK|   Ads|\n",
      "|     US|   Seo|\n",
      "|     US|   Seo|\n",
      "|  China|   Seo|\n",
      "|     US|   Seo|\n",
      "+-------+------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#acess multiple columns\n",
    "df.select(['country','source']).show(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Add or Remove column "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### using udf (user defined functions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql.types import StringType\n",
    "from pyspark.sql.functions import udf\n",
    "\n",
    "def country_udf(country):\n",
    "    if country =='UK':\n",
    "        return 'Britain'\n",
    "    elif country =='US':\n",
    "        return 'USA'\n",
    "    elif country =='China':\n",
    "        return 'Asia'\n",
    "    elif country =='Germany':\n",
    "        return 'Deustche'\n",
    "    else:\n",
    "        return country\n",
    "        \n",
    "spark_udf = udf(country_udf, StringType())\n",
    "\n",
    "df=df.withColumn(\"country_new\", spark_udf(df.country))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+---+--------+------+-------------------+---------+-----------+\n",
      "|country|age|new_user|source|total_pages_visited|converted|country_new|\n",
      "+-------+---+--------+------+-------------------+---------+-----------+\n",
      "|     UK| 25|       1|   Ads|                  1|        0|    Britain|\n",
      "|     US| 23|       1|   Seo|                  5|        0|        USA|\n",
      "|     US| 28|       1|   Seo|                  4|        0|        USA|\n",
      "|  China| 39|       1|   Seo|                  5|        0|       Asia|\n",
      "|     US| 30|       1|   Seo|                  6|        0|        USA|\n",
      "|     US| 31|       0|   Seo|                  1|        0|        USA|\n",
      "|  China| 27|       1|   Seo|                  4|        0|       Asia|\n",
      "|     US| 23|       0|   Ads|                  4|        0|        USA|\n",
      "|     UK| 29|       0|Direct|                  4|        0|    Britain|\n",
      "|     US| 25|       0|   Ads|                  2|        0|        USA|\n",
      "+-------+---+--------+------+-------------------+---------+-----------+\n",
      "only showing top 10 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.show(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### without using udf "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {},
   "outputs": [],
   "source": [
    "#create new column with age +2  value\n",
    "df=df.withColumn('new_age',df['age'] +2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+---+--------+------+-------------------+---------+-----------+-------+\n",
      "|country|age|new_user|source|total_pages_visited|converted|country_new|new_age|\n",
      "+-------+---+--------+------+-------------------+---------+-----------+-------+\n",
      "|     UK| 25|       1|   Ads|                  1|        0|    Britain|     27|\n",
      "|     US| 23|       1|   Seo|                  5|        0|        USA|     25|\n",
      "|     US| 28|       1|   Seo|                  4|        0|        USA|     30|\n",
      "|  China| 39|       1|   Seo|                  5|        0|       Asia|     41|\n",
      "|     US| 30|       1|   Seo|                  6|        0|        USA|     32|\n",
      "|     US| 31|       0|   Seo|                  1|        0|        USA|     33|\n",
      "|  China| 27|       1|   Seo|                  4|        0|       Asia|     29|\n",
      "|     US| 23|       0|   Ads|                  4|        0|        USA|     25|\n",
      "|     UK| 29|       0|Direct|                  4|        0|    Britain|     31|\n",
      "|     US| 25|       0|   Ads|                  2|        0|        USA|     27|\n",
      "+-------+---+--------+------+-------------------+---------+-----------+-------+\n",
      "only showing top 10 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.show(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Drop /Delete columns "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {},
   "outputs": [],
   "source": [
    "#delete the new_age column\n",
    "df=df.drop('new_age')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {},
   "outputs": [],
   "source": [
    "#delete the country_new column\n",
    "df=df.drop('country_new')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+---+--------+------+-------------------+---------+\n",
      "|country|age|new_user|source|total_pages_visited|converted|\n",
      "+-------+---+--------+------+-------------------+---------+\n",
      "|     UK| 25|       1|   Ads|                  1|        0|\n",
      "|     US| 23|       1|   Seo|                  5|        0|\n",
      "|     US| 28|       1|   Seo|                  4|        0|\n",
      "|  China| 39|       1|   Seo|                  5|        0|\n",
      "|     US| 30|       1|   Seo|                  6|        0|\n",
      "+-------+---+--------+------+-------------------+---------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.show(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Acess row objects of dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(country='UK', age=25, new_user=1, source='Ads', total_pages_visited=1, converted=0),\n",
       " Row(country='US', age=23, new_user=1, source='Seo', total_pages_visited=5, converted=0),\n",
       " Row(country='US', age=28, new_user=1, source='Seo', total_pages_visited=4, converted=0)]"
      ]
     },
     "execution_count": 144,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#access first 3 rows\n",
    "df.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Row(country='UK', age=25, new_user=1, source='Ads', total_pages_visited=1, converted=0)"
      ]
     },
     "execution_count": 145,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#access first row object \n",
    "df.head(3)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'UK'"
      ]
     },
     "execution_count": 146,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#access first row object\n",
    "df.head(3)[0][0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Filtering "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+---+--------+------+-------------------+---------+\n",
      "|country|age|new_user|source|total_pages_visited|converted|\n",
      "+-------+---+--------+------+-------------------+---------+\n",
      "|Germany|123|       0|   Seo|                 15|        1|\n",
      "|     US| 77|       0|Direct|                  4|        0|\n",
      "|     US| 79|       1|Direct|                  1|        0|\n",
      "|     UK|111|       0|   Ads|                 10|        1|\n",
      "+-------+---+--------+------+-------------------+---------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#filter records where age of user is more than 75 years\n",
    "df.filter(df['age'] >75).show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+---------+---+\n",
      "|country|converted|age|\n",
      "+-------+---------+---+\n",
      "|Germany|        1|123|\n",
      "|     US|        0| 77|\n",
      "|     US|        0| 79|\n",
      "|     UK|        1|111|\n",
      "+-------+---------+---+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#filter records and show only country and converted status of that user\n",
    "df.filter(df['age'] > 75).select(['country','converted','age']).show(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Multiple filter conditions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+---+--------+------+-------------------+---------+\n",
      "|country|age|new_user|source|total_pages_visited|converted|\n",
      "+-------+---+--------+------+-------------------+---------+\n",
      "|     US| 77|       0|Direct|                  4|        0|\n",
      "|     US| 79|       1|Direct|                  1|        0|\n",
      "+-------+---+--------+------+-------------------+---------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#select people over 75 years only from US\n",
    "df.filter(df['age'] > 75).filter(df['country'] =='US').show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+---+--------+------+-------------------+---------+\n",
      "|country|age|new_user|source|total_pages_visited|converted|\n",
      "+-------+---+--------+------+-------------------+---------+\n",
      "|Germany| 31|       0|Direct|                  2|        1|\n",
      "+-------+---+--------+------+-------------------+---------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#selet users who have more less than 3 visited pages and are still converted from Germany \n",
    "df.filter(df['total_pages_visited'] < 3).filter(df['converted']==1).filter(df['country'] =='Germany').show(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Count Records "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "316200"
      ]
     },
     "execution_count": 151,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#total records in df \n",
    "df.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------+\n",
      "|country| count|\n",
      "+-------+------+\n",
      "|Germany| 13056|\n",
      "|  China| 76602|\n",
      "|     US|178092|\n",
      "|     UK| 48450|\n",
      "+-------+------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Frequency count of column values\n",
    "df.groupBy('country').count().show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------+\n",
      "|country| count|\n",
      "+-------+------+\n",
      "|     US|178092|\n",
      "|  China| 76602|\n",
      "|     UK| 48450|\n",
      "|Germany| 13056|\n",
      "+-------+------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "## Ordered Frequency count \n",
    "df.groupBy('country').count().orderBy('count',ascending=False).show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---------+------+\n",
      "|converted| count|\n",
      "+---------+------+\n",
      "|        1| 10200|\n",
      "|        0|306000|\n",
      "+---------+------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#Total converted vs non converted user counts\n",
    "df.groupBy('converted').count().show(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---------+------------------+------------------+------------------------+--------------+\n",
      "|converted|          avg(age)|     avg(new_user)|avg(total_pages_visited)|avg(converted)|\n",
      "+---------+------------------+------------------+------------------------+--------------+\n",
      "|        1|26.546764705882353|0.2979411764705882|      14.553529411764705|           1.0|\n",
      "|        0|30.703960784313725|0.6983823529411765|       4.550281045751634|           0.0|\n",
      "+---------+------------------+------------------+------------------------+--------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#Mean value of conversion for each source\n",
    "df.groupBy('converted').mean().show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Collect "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Save the results as a list with row objects\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create a list with only converted users data from China\n",
    "china_data=df.filter((df['country']=='China') & (df['converted'] ==1)).collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(country='China', age='24', new_user='0', source='Seo', total_pages_visited='18', converted='1'),\n",
       " Row(country='China', age='26', new_user='1', source='Ads', total_pages_visited='18', converted='1'),\n",
       " Row(country='China', age='30', new_user='0', source='Ads', total_pages_visited='17', converted='1'),\n",
       " Row(country='China', age='26', new_user='0', source='Seo', total_pages_visited='8', converted='1'),\n",
       " Row(country='China', age='33', new_user='1', source='Direct', total_pages_visited='13', converted='1')]"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#view the new list \n",
    "china_data[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "#view the list object as a dictionary\n",
    "china_dict=china_data[0].asDict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'24'"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "china_dict['age']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'18'"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "china_dict['total_pages_visited']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Aggregate Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------+\n",
      "|          avg(age)|\n",
      "+------------------+\n",
      "|30.569857685009488|\n",
      "+------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.agg({'age':'mean'}).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------------------+\n",
      "|     avg(converted)|\n",
      "+-------------------+\n",
      "|0.03225806451612903|\n",
      "+-------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.agg({'converted':'mean'}).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------+\n",
      "|max(age)|\n",
      "+--------+\n",
      "|     123|\n",
      "+--------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.agg({'age':'max'}).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------+\n",
      "|count(country)|\n",
      "+--------------+\n",
      "|        316200|\n",
      "+--------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.agg({'country':'count'}).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------+\n",
      "|min(age)|\n",
      "+--------+\n",
      "|      17|\n",
      "+--------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.agg({'age':'min'}).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+--------+\n",
      "|country|max(age)|\n",
      "+-------+--------+\n",
      "|Germany|     123|\n",
      "|  China|      69|\n",
      "|     US|      79|\n",
      "|     UK|     111|\n",
      "+-------+--------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "## aggregation on grouped data by country\n",
    "df.groupBy('country').agg({'age':'max'}).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------+--------+\n",
      "|country|source|max(age)|\n",
      "+-------+------+--------+\n",
      "|Germany|Direct|      61|\n",
      "|  China|Direct|      65|\n",
      "|     UK|   Ads|     111|\n",
      "|     US|   Seo|      73|\n",
      "|     UK|   Seo|      66|\n",
      "|Germany|   Seo|     123|\n",
      "|Germany|   Ads|      64|\n",
      "|  China|   Seo|      68|\n",
      "|     UK|Direct|      69|\n",
      "|  China|   Ads|      69|\n",
      "|     US|   Ads|      70|\n",
      "|     US|Direct|      79|\n",
      "+-------+------+--------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "## aggregation on grouped data by country,source\n",
    "df.groupBy(['country','source']).agg({'age':'max'}).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 170,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+---------+------------------------+\n",
      "|country|converted|avg(total_pages_visited)|\n",
      "+-------+---------+------------------------+\n",
      "|Germany|        0|       4.565277777777778|\n",
      "|  China|        1|      14.352941176470589|\n",
      "|  China|        0|      4.5404575163398695|\n",
      "|     US|        0|       4.551785714285714|\n",
      "|     UK|        0|       4.557037037037037|\n",
      "|Germany|        1|      14.572303921568627|\n",
      "|     UK|        1|       14.53450980392157|\n",
      "|     US|        1|      14.561497326203208|\n",
      "+-------+---------+------------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "## aggregation on grouped data by country,converted\n",
    "df.groupBy(['country','converted']).agg({'total_pages_visited':'mean'}).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
